# load libraries
suppressPackageStartupMessages({
library(scater)
library(SingleCellExperiment)
})
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
# load qc'd data
sce <- readRDS("data/sceQC.rds")
https://osca.bioconductor.org/normalization.html
Divide all counts for each cell by a cell-specific size factor. Assumes that all genes within a cell are affected equally.
Library size is the total sum of counts across all genes for each cell. Library size factor is library size, normalized so that mean size factor across all cells is 1.
libsize <- librarySizeFactors(sce)
summary(libsize)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.004238 0.386809 0.653320 1.000000 1.445357 10.214326
hist(libsize)
hist(log10(libsize))
Note: This normalization technique assumes that each cell has a similar total number of transcripts, ie upregulation of one set of genes means balanced downregulation of a different set of genes. When this is not the case, you will get composition bias and innacurate normalization values. This may not be an issue for exploratory analysis because you will still get the same clusters, just with inaccurate distances between clusters. Options to correct for composition bias include normalization by convolution, and spike-in normalization.
Scale by size factor, then log-transform values.
library(scran)
## Warning: package 'scran' was built under R version 3.6.2
set.seed(1000)
clusters <- quickCluster(sce)
table(clusters)
## clusters
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 803 122 147 1947 428 405 434 878 510 281 426 924 181 712 248 301
## 17 18
## 287 171
sce <- computeSumFactors(sce, cluster=clusters)
## Warning in FUN(...): encountered negative size factor estimates
sce <- logNormCounts(sce)
assayNames(sce)
## [1] "counts" "logcounts"
Select genes that contain biological information and remove genes that contain random noise
https://osca.bioconductor.org/feature-selection.html#quantifying-per-gene-variation
fit a trend to the variance with respect to abundance across all genes. This trend represents the technical noise.
dec <- modelGeneVar(sce)
dec
## DataFrame with 29286 rows and 6 columns
## mean total tech
## <numeric> <numeric> <numeric>
## A1BG 0.00131245178380456 0.00113470759936777 0.00173097527675417
## A1BG-AS1 0.0295184112917586 0.0446854224698095 0.038928882676635
## A1CF 0.00217581820793955 0.00646234816241916 0.00286965711777376
## A2M 0.186422484940104 0.365301778888915 0.244981524408179
## A2M-AS1 0.0556500680845831 0.0980280131825152 0.0733793013194764
## ... ... ... ...
## hsa-mir-7515 0.000314887234502944 0.000327095076897506 0.000415300659501429
## hsa-mir-8072 0.010563624069907 0.0170271419446693 0.0139321045309014
## snoR26 0.000564073951137366 0.00238711477296874 0.000743949750564854
## snoU2-30 0.00178432624538083 0.00304594899311063 0.00235332397457906
## snoZ40 0.000248721329966043 0.000569442471316782 0.00032803531353463
## bio p.value FDR
## <numeric> <numeric> <numeric>
## A1BG -0.000596267677386396 0.989975074786507 0.999999999779311
## A1BG-AS1 0.00575653979317453 0.159079031656266 0.486603571666508
## A1CF 0.0035926910446454 1.43650813404782e-17 2.21622854166644e-16
## A2M 0.120320254480736 0.000457319277195411 0.00284979117081208
## A2M-AS1 0.0246487118630388 0.0116761572338632 0.0548501251097815
## ... ... ... ...
## hsa-mir-7515 -8.82055826039235e-05 0.924182828828918 0.999999999779311
## hsa-mir-8072 0.00309503741376789 0.0668484776097457 0.246157897488013
## snoR26 0.00164316502240388 1.41166298624274e-50 5.90515127102321e-49
## snoU2-30 0.000692625018531564 0.0234692076399773 0.101257457996027
## snoZ40 0.000241407157782152 3.38279135696287e-07 2.99339861559819e-06
fit <- metadata(dec)
plot(fit$mean, fit$var, xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(fit$trend(x), col="dodgerblue", add=TRUE, lwd=2)
dec[order(dec$bio, decreasing=TRUE),]
## DataFrame with 29286 rows and 6 columns
## mean total tech bio
## <numeric> <numeric> <numeric> <numeric>
## PRG4 3.10819917121007 8.33680384390871 2.37808423194192 5.9587196119668
## COL3A1 2.20123153207015 6.71405611825128 2.02497816843305 4.68907794981823
## COL1A2 2.2100474212612 6.16657849628065 2.02758869918712 4.13898979709352
## PLA2G2A 1.86553399507151 5.99557532270241 1.87931731593029 4.11625800677211
## CD74 3.77147833986318 6.80794871481592 2.6996303137858 4.10831840103012
## ... ... ... ... ...
## RPL34 3.35028246456352 2.13102460889053 2.54104513460341 -0.410020525712887
## RPS6 3.50588094888864 2.18815796357584 2.60780307801807 -0.419645114442229
## RPL11 3.05049835525517 1.91141272087692 2.34772840964006 -0.436315688763137
## RPL23 2.74770929161865 1.76292907881772 2.22086443297396 -0.45793535415624
## GNAS 3.18199287465945 1.74240075628847 2.43178448404888 -0.689383727760414
## p.value FDR
## <numeric> <numeric>
## PRG4 1.74001102020133e-64 9.75651440321398e-63
## COL3A1 2.20078162909224e-55 1.00946928236954e-53
## COL1A2 1.67024073776431e-43 5.86892227236964e-42
## PLA2G2A 9.01066742549993e-50 3.67384300981459e-48
## CD74 4.6488489634771e-25 9.60491626817727e-24
## ... ... ...
## RPL34 0.86198672598418 0.999999999779311
## RPS6 0.861330960451216 0.999999999779311
## RPL11 0.895186693054653 0.999999999779311
## RPL23 0.918035263590258 0.999999999779311
## GNAS 0.972174104937347 0.999999999779311
This gives us an error…….
Suggestion is to use variance of log counts
cv <- modelGeneCV2(sce)
Separate biological noise from technical noise
You can use spike-ins but I don’t think we have spike-in data for some reason? The other option is to model gene variation based on a poisson distribution.
set.seed(0010101)
dec.pois <- modelGeneVarByPoisson(sce, block = sce$disease)
dec.pois[order(dec.pois$bio, decreasing=TRUE),]
https://osca.bioconductor.org/feature-selection.html#hvg-selection
This gives us the top 10% of highly variable genes
chosenGenes <- getTopHVGs(dec, prop=0.1)
str(chosenGenes)
## chr [1:1473] "PRG4" "COL3A1" "COL1A2" "PLA2G2A" "CD74" "IGHG4" "IGHG3" ...
sce.hvg <- sce[chosenGenes,]
sce.hvg
## class: SingleCellExperiment
## dim: 1473 9205
## metadata(0):
## assays(2): counts logcounts
## rownames(1473): PRG4 COL3A1 ... CKS2 PITPNC1
## rowData names(0):
## colnames: NULL
## colData names(36): cell_name barcode ... batch discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
saveRDS(sce.hvg, "data/sceHVG.rds")
sce <- readRDS("data/sceHVG.rds")
sce <- runPCA(sce)
reducedDims(sce)
## List of length 1
## names(1): PCA
dim(reducedDim(sce, "PCA"))
## [1] 9205 50
Find elbow point to decide on the number of PCs to keep for downstream analysis
# By elbow point
percentVar <- attr(reducedDim(sce), "percentVar")
elbow <- PCAtools::findElbowPoint(percentVar)
elbow
## [1] 5
plot(percentVar, xlab="PC", ylab="Variance explained (%)")
abline(v = elbow, col="red")
# Using technical noise
set.seed(111001001)
denoised <- denoisePCA(sce, technical=dec)
ncol(reducedDim(denoised))
## [1] 5
plotReducedDim(sce, dimred="PCA", colour_by="type")
plotReducedDim(sce, dimred="PCA", ncomponents=4,
colour_by="type")
set.seed(00101001101)
# runTSNE() stores the t-SNE coordinates in the reducedDims
# for re-use across multiple plotReducedDim() calls.
sce <- runTSNE(sce, dimred="PCA")
plotReducedDim(sce, dimred="TSNE", colour_by="type")
set.seed(1100101001)
sce <- runUMAP(sce, dimred="PCA")
plotReducedDim(sce, dimred="UMAP", colour_by="type")
reducedDims(sce)
## List of length 3
## names(3): PCA TSNE UMAP
saveRDS(sce, "data/scePrepped.rds")
Color points by other factors
plotReducedDim(sce, dimred = "TSNE", colour_by="sample")
plotReducedDim(sce, dimred = "TSNE", colour_by="plate")
plotReducedDim(sce, dimred = "TSNE", colour_by="disease")
plotReducedDim(sce, dimred = "TSNE", colour_by="molecules")